Assignment 1

The goal of this assignment is to create a linear regression model of baseball team statistics to predict the number of wins for a given team.

Following example: http://www.sthda.com/english/articles/40-regression-analysis/168-multiple-linear-regression-in-r/#:~:text=Multiple%20linear%20regression%20is%20an,distinct%20predictor%20variables%20(x).&text=The%20%E2%80%9Cb%E2%80%9D%20values%20are%20called,weights%20(or%20beta%20coefficients).

Following approach here, too: https://machinelearningmastery.com/machine-learning-in-r-step-by-step/

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.2     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(ggExtra)
# Load the Moneyball training data from the working directory
mb_train_init <- read.csv(file = "moneyball-training-data.csv")
# Row and column counts of the loaded data frame
c(nrow(mb_train_init), ncol(mb_train_init))
## [1] 2276   17
# Class of every column. vapply() always returns a named character vector,
# whereas sapply() would silently return a list if any column carried more
# than one class; only the first class of each column is reported.
vapply(mb_train_init, function(column) class(column)[1L], character(1))
##            INDEX      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B 
##        "integer"        "integer"        "integer"        "integer" 
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##        "integer"        "integer"        "integer"        "integer" 
##  TEAM_BASERUN_SB  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H 
##        "integer"        "integer"        "integer"        "integer" 
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##        "integer"        "integer"        "integer"        "integer" 
## TEAM_FIELDING_DP 
##        "integer"
# First six rows (head() prints six rows when no count is given)
head(mb_train_init)
##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1     1          39           1445             194              39
## 2     2          70           1339             219              22
## 3     3          86           1377             232              35
## 4     4          70           1387             209              38
## 5     5          82           1297             186              27
## 6     6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842              NA
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1              NA               NA            9364               84
## 2              28               NA            1347              191
## 3              27               NA            1377              137
## 4              30               NA            1396               97
## 5              39               NA            1297              102
## 6              59               NA            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011               NA
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149

The first six instances are printed for a simple visual inspection. Initial observation: TEAM_BATTING_HBP is NA (not available) in every row shown. This will need further investigation.

# Per-column summary statistics: min, quartiles, mean, max and NA counts
summary(mb_train_init)
##      INDEX         TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00    Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00    Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36    Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00    Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                    
##  TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##  Min.   :  0.0    Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0    1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0    Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7    Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0    3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0    Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                    NA's   :102                       
##  TEAM_FIELDING_DP
##  Min.   : 52.0   
##  1st Qu.:131.0   
##  Median :149.0   
##  Mean   :146.4   
##  3rd Qu.:164.0   
##  Max.   :228.0   
##  NA's   :286

Display summary of all the variables (statistics)

Variables containing at least one NA value

  • TEAM_BATTING_SO
  • TEAM_BASERUN_SB
  • TEAM_BASERUN_CS
  • TEAM_BATTING_HBP (majority of instances contain NA)
  • TEAM_PITCHING_SO
  • TEAM_FIELDING_DP

Interesting observations

  • Minimum number of wins is 0: Given a 162 game schedule, I don’t believe 0 wins is a realistic value
  • Maximum number of wins is 146: Given a 162-game schedule, this is an unrealistic number. The most wins by an MLB team in a season is 116 (1906 Chicago Cubs and 2001 Seattle Mariners).
  • Maximum number of TEAM_PITCHING_H is 30132: This indicates a team gave up over 180 hits per game. That’s completely unrealistic.
  • Maximum number of TEAM_PITCHING_SO is 19278: This indicates a team achieved 119 strikeouts per game. Literally impossible.
# Pairs plot of columns 3 to 7: team batting hits, doubles, triples,
# home runs and walks.
# ggpairs() does not take a `color` argument (it was ignored with a warning),
# so it is not passed here.
ggpairs(mb_train_init[, 3:7])

Not much value in above visualization

# Wins against team batting hits, with a fitted linear trend line
p1 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_H, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Total Hits", x = "Batting Hits", y = "Wins")
# Same scatterplot with marginal density curves
ggMarginal(p = p1, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Same scatterplot with marginal boxplots
ggMarginal(p = p1, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Wins against team batting doubles, with a fitted linear trend line
p2 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_2B, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Doubles", x = "Batting Doubles", y = "Wins")
# Same scatterplot with marginal density curves
ggMarginal(p = p2, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Same scatterplot with marginal boxplots
ggMarginal(p = p2, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Wins against team batting triples, with a fitted linear trend line
p3 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_3B, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Triples", x = "Batting Triples", y = "Wins")
# Same scatterplot with marginal density curves
ggMarginal(p = p3, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Same scatterplot with marginal boxplots
ggMarginal(p = p3, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Wins against team batting home runs, with a fitted linear trend line
p4 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_HR, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Home Runs", x = "Batting Home Runs", y = "Wins")
# Same scatterplot with marginal density curves
ggMarginal(p = p4, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Same scatterplot with marginal boxplots
ggMarginal(p = p4, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Wins against team batting walks, with a fitted linear trend line
p5 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_BB, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Walks", x = "Batting Walks", y = "Wins")
# Same scatterplot with marginal density curves
ggMarginal(p = p5, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Same scatterplot with marginal boxplots
ggMarginal(p = p5, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Wins against team batting strikeouts, with a fitted linear trend line
# (rows with a missing strikeout count are dropped by ggplot2 with a warning)
p6 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_SO, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Strikeouts", x = "Batting Strikeouts", y = "Wins")
# Same scatterplot with marginal density curves
ggMarginal(p = p6, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).

## Warning: Removed 102 rows containing missing values (geom_point).

# Same scatterplot with marginal boxplots
ggMarginal(p = p6, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).

## Warning: Removed 102 rows containing missing values (geom_point).

# Wins against batters hit by pitch, with a fitted linear trend line
# (most rows have no HBP value and are dropped by ggplot2 with a warning)
p7 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BATTING_HBP, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by HBP", x = "Batting HBP", y = "Wins")
# Same scatterplot with marginal density curves
ggMarginal(p = p7, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## Warning: Removed 2085 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).

## Warning: Removed 2085 rows containing missing values (geom_point).

# Same scatterplot with marginal boxplots
ggMarginal(p = p7, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).
## Warning: Removed 2085 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2085 rows containing non-finite values (stat_smooth).

## Warning: Removed 2085 rows containing missing values (geom_point).

# Wins against stolen bases, with a fitted linear trend line
# (rows with a missing stolen-base count are dropped by ggplot2 with a warning)
p8 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BASERUN_SB, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Wins by Stolen Bases",
    x = "Base running Stolen Bases",
    y = "Wins"
  )
# Same scatterplot with marginal density curves
ggMarginal(p = p8, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## Warning: Removed 131 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).

## Warning: Removed 131 rows containing missing values (geom_point).

# Same scatterplot with marginal boxplots
ggMarginal(p = p8, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).
## Warning: Removed 131 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 131 rows containing non-finite values (stat_smooth).

## Warning: Removed 131 rows containing missing values (geom_point).

# Wins against caught-stealing counts, with a fitted linear trend line
# (rows with a missing caught-stealing count are dropped with a warning)
p9 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_BASERUN_CS, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Wins by Caught Stealing",
    x = "Base running Caught Stealing",
    y = "Wins"
  )
# Same scatterplot with marginal density curves
ggMarginal(p = p9, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## Warning: Removed 772 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).

## Warning: Removed 772 rows containing missing values (geom_point).

# Same scatterplot with marginal boxplots
ggMarginal(p = p9, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).
## Warning: Removed 772 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 772 rows containing non-finite values (stat_smooth).

## Warning: Removed 772 rows containing missing values (geom_point).

# Wins against hits allowed by pitching, with a fitted linear trend line
p10 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_PITCHING_H, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Wins by Hits Allowed", x = "Pitching Hits Allowed", y = "Wins")
# Same scatterplot with marginal density curves
ggMarginal(p = p10, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Same scatterplot with marginal boxplots
ggMarginal(p = p10, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Wins against home runs allowed by pitching, with a fitted linear trend line
p11 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_PITCHING_HR, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Wins by Home Runs Allowed",
    x = "Pitching Home Runs Allowed",
    y = "Wins"
  )
# Same scatterplot with marginal density curves
ggMarginal(p = p11, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Same scatterplot with marginal boxplots
ggMarginal(p = p11, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Wins against walks allowed by pitching, with a fitted linear trend line
p12 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_PITCHING_BB, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Wins by Walks Allowed",
    x = "Pitching Walks Allowed",
    y = "Wins"
  )
# Same scatterplot with marginal density curves
ggMarginal(p = p12, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Same scatterplot with marginal boxplots
ggMarginal(p = p12, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Wins against strikeouts recorded by pitching, with a fitted linear trend
# line (rows with a missing strikeout count are dropped with a warning).
# The title names pitching explicitly so this figure is not confused with the
# batting-strikeouts plot, which is titled "Wins by Strikeouts".
p13 <- ggplot(mb_train_init, aes(x = TEAM_PITCHING_SO, y = TARGET_WINS)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    x = "Pitching Strikeouts",
    y = "Wins",
    title = "Wins by Pitching Strikeouts"
  )
# Scatterplot with density plot
ggMarginal(p13, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).

## Warning: Removed 102 rows containing missing values (geom_point).

# Scatterplot with boxplot
ggMarginal(p13, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).
## Warning: Removed 102 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 102 rows containing non-finite values (stat_smooth).

## Warning: Removed 102 rows containing missing values (geom_point).

# Wins against fielding errors, with a fitted linear trend line.
# The plot title previously misspelled "Committed".
p14 <- ggplot(mb_train_init, aes(x = TEAM_FIELDING_E, y = TARGET_WINS)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(x = "Fielding Errors", y = "Wins", title = "Wins by Errors Committed")
# Scatterplot with density plot
ggMarginal(p14, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Scatterplot with boxplot
ggMarginal(p14, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# Wins against fielding double plays, with a fitted linear trend line
# (rows with a missing double-play count are dropped with a warning)
p15 <- ggplot(
  data = mb_train_init,
  mapping = aes(x = TEAM_FIELDING_DP, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(
    title = "Wins by Defensive Double Plays",
    x = "Fielding Double Plays",
    y = "Wins"
  )
# Same scatterplot with marginal density curves
ggMarginal(p = p15, type = "density")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## Warning: Removed 286 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).

## Warning: Removed 286 rows containing missing values (geom_point).

# Same scatterplot with marginal boxplots
ggMarginal(p = p15, type = "boxplot")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).
## Warning: Removed 286 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 286 rows containing non-finite values (stat_smooth).

## Warning: Removed 286 rows containing missing values (geom_point).

Data Exploration Notes

# Fold the separate hit counts into one measure: total bases

# TEAM_BATTING_H counts every hit type, so singles are what remains once
# doubles, triples and home runs are taken out
mb_train_init$TEAM_BATTING_1B <- with(
  mb_train_init,
  TEAM_BATTING_H - (TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR)
)

# Weight each hit type by the bases it is worth (1, 2, 3, 4)
mb_train_init$TOTAL_BASES <- with(
  mb_train_init,
  TEAM_BATTING_1B +
    2 * TEAM_BATTING_2B +
    3 * TEAM_BATTING_3B +
    4 * TEAM_BATTING_HR
)

head(mb_train_init)
##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1     1          39           1445             194              39
## 2     2          70           1339             219              22
## 3     3          86           1377             232              35
## 4     4          70           1387             209              38
## 5     5          82           1297             186              27
## 6     6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842              NA
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1              NA               NA            9364               84
## 2              28               NA            1347              191
## 3              27               NA            1377              137
## 4              30               NA            1396               97
## 5              39               NA            1297              102
## 6              59               NA            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011               NA
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
##   TEAM_BATTING_1B TOTAL_BASES
## 1            1199        1756
## 2             908        2172
## 3             973        2090
## 4            1044        1960
## 5             982        1843
## 6             951        1827
# Wins against the derived total-bases column, with a fitted linear trend line
ggplot(
  data = mb_train_init,
  mapping = aes(x = TOTAL_BASES, y = TARGET_WINS)
) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Pull out the rows recorded with zero wins to see how many there are
zero_wins <- mb_train_init[which(mb_train_init$TARGET_WINS == 0), ]

head(zero_wins)
##      INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1211  1347           0            891             135               0
##      TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1211               0               0               0               0
##      TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1211               0               NA           24057                0
##      TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1211                0                0            1890               NA
##      TEAM_BATTING_1B TOTAL_BASES
## 1211             756        1026
# Only one row has zero wins, and its other values look bogus
# Isolate (without removing yet) rows with more than 2000 hits allowed
bad_pitching <- mb_train_init[which(mb_train_init$TEAM_PITCHING_H > 2000), ]

summary(bad_pitching)
##      INDEX       TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   1   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 462   1st Qu.: 71.00   1st Qu.:1450   1st Qu.:206.0  
##  Median :1188   Median : 87.00   Median :1576   Median :237.0  
##  Mean   :1236   Mean   : 82.08   Mean   :1609   Mean   :242.7  
##  3rd Qu.:2035   3rd Qu.:100.00   3rd Qu.:1712   3rd Qu.:280.0  
##  Max.   :2535   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 51.00   1st Qu.: 15.00   1st Qu.:174.0   1st Qu.: 319.0  
##  Median : 82.00   Median : 33.00   Median :276.0   Median : 456.0  
##  Mean   : 81.26   Mean   : 50.32   Mean   :341.4   Mean   : 497.6  
##  3rd Qu.:107.00   3rd Qu.: 62.00   3rd Qu.:523.0   3rd Qu.: 685.0  
##  Max.   :223.00   Max.   :239.00   Max.   :819.0   Max.   :1264.0  
##                                                                    
##  TEAM_BASERUN_SB TEAM_BASERUN_CS  TEAM_BATTING_HBP TEAM_PITCHING_H
##  Min.   :  0.0   Min.   :  0.00   Min.   : NA      Min.   : 2003  
##  1st Qu.:107.2   1st Qu.: 43.00   1st Qu.: NA      1st Qu.: 2130  
##  Median :198.5   Median : 54.00   Median : NA      Median : 2412  
##  Mean   :221.2   Mean   : 53.03   Mean   :NaN      Mean   : 3810  
##  3rd Qu.:318.0   3rd Qu.: 67.00   3rd Qu.: NA      3rd Qu.: 3861  
##  Max.   :697.0   Max.   :118.00   Max.   : NA      Max.   :30132  
##  NA's   :123     NA's   :192      NA's   :257                     
##  TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##  Min.   :  0.0    Min.   :   0.0   Min.   :    0.0   Min.   :  82.0  
##  1st Qu.: 33.0    1st Qu.: 380.0   1st Qu.:  475.0   1st Qu.: 443.0  
##  Median : 55.0    Median : 553.0   Median :  845.0   Median : 660.0  
##  Mean   : 81.4    Mean   : 613.6   Mean   :  986.2   Mean   : 674.9  
##  3rd Qu.: 97.0    3rd Qu.: 752.0   3rd Qu.: 1156.0   3rd Qu.: 894.0  
##  Max.   :343.0    Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                                                      
##  TEAM_FIELDING_DP TEAM_BATTING_1B  TOTAL_BASES  
##  Min.   : 52.0    Min.   : 709    Min.   :1026  
##  1st Qu.: 97.0    1st Qu.:1068    1st Qu.:1934  
##  Median :127.0    Median :1208    Median :2159  
##  Mean   :124.5    Mean   :1234    Mean   :2165  
##  3rd Qu.:149.0    3rd Qu.:1356    3rd Qu.:2376  
##  Max.   :201.0    Max.   :2112    Max.   :3290  
##  NA's   :156
c(nrow(bad_pitching), ncol(bad_pitching))
## [1] 257  19
# 257 rows match, which seems like a lot of bad pitching
# The single-season record is 116 wins, so count the rows that exceed it
too_many_wins <- mb_train_init[which(mb_train_init$TARGET_WINS > 116), ]

summary(too_many_wins)
##      INDEX       TARGET_WINS    TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   : 323   Min.   :117.0   Min.   :1403   Min.   :193.0  
##  1st Qu.: 462   1st Qu.:118.0   1st Qu.:1561   1st Qu.:221.0  
##  Median : 492   Median :122.0   Median :1689   Median :280.0  
##  Mean   :1112   Mean   :124.5   Mean   :1867   Mean   :286.4  
##  3rd Qu.:2034   3rd Qu.:128.0   3rd Qu.:2273   3rd Qu.:322.0  
##  Max.   :2250   Max.   :146.0   Max.   :2554   Max.   :393.0  
##                                                               
##  TEAM_BATTING_3B TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO
##  Min.   : 41.0   Min.   :  8.00   Min.   : 64.0   Min.   :  0.0  
##  1st Qu.: 76.0   1st Qu.: 22.00   1st Qu.:170.0   1st Qu.: 78.0  
##  Median :108.0   Median : 29.00   Median :266.0   Median :419.0  
##  Mean   :100.5   Mean   : 41.88   Mean   :313.6   Mean   :349.2  
##  3rd Qu.:119.0   3rd Qu.: 46.00   3rd Qu.:477.0   3rd Qu.:645.5  
##  Max.   :156.0   Max.   :164.00   Max.   :670.0   Max.   :777.0  
##                                                   NA's   :2      
##  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
##  Min.   : 32.0   Min.   :35.00   Min.   : NA      Min.   : 1495  
##  1st Qu.:148.2   1st Qu.:41.50   1st Qu.: NA      1st Qu.: 2066  
##  Median :228.0   Median :48.00   Median : NA      Median : 2570  
##  Mean   :207.6   Mean   :52.33   Mean   :NaN      Mean   : 4112  
##  3rd Qu.:284.0   3rd Qu.:61.00   3rd Qu.: NA      3rd Qu.: 5253  
##  Max.   :324.0   Max.   :74.00   Max.   : NA      Max.   :13724  
##  NA's   :9       NA's   :14      NA's   :17                      
##  TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E 
##  Min.   : 15.00   Min.   : 131.0   Min.   :   0.0   Min.   : 135.0  
##  1st Qu.: 33.00   1st Qu.: 371.0   1st Qu.: 225.5   1st Qu.: 479.0  
##  Median : 59.00   Median : 467.0   Median : 652.0   Median : 692.0  
##  Mean   : 79.12   Mean   : 521.9   Mean   : 547.0   Mean   : 678.6  
##  3rd Qu.: 97.00   3rd Qu.: 628.0   3rd Qu.: 841.5   3rd Qu.: 928.0  
##  Max.   :301.00   Max.   :1539.0   Max.   :1114.0   Max.   :1192.0  
##                                    NA's   :2                        
##  TEAM_FIELDING_DP TEAM_BATTING_1B  TOTAL_BASES  
##  Min.   : 79.0    Min.   :1036    Min.   :1811  
##  1st Qu.: 86.0    1st Qu.:1115    1st Qu.:2148  
##  Median :104.0    Median :1291    Median :2292  
##  Mean   :105.2    Mean   :1438    Mean   :2480  
##  3rd Qu.:107.0    3rd Qu.:1795    3rd Qu.:2936  
##  Max.   :156.0    Max.   :2016    Max.   :3290  
##  NA's   :11
dim(too_many_wins)
## [1] 17 19
# 17 rows report more than 116 wins
# Full model: regress wins on every batting, baserunning, pitching and
# fielding count, with total hits replaced by the derived singles column plus
# doubles, triples and home runs.
# Note: TEAM_BATTING_HBP is left out because including it removes over 2000
# observations.
# lm() drops every row with an NA in any column it uses, so the remaining
# predictors that contain NAs still cost rows in this fit.
model <- lm(TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
              TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
              TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
              TEAM_FIELDING_DP, 
            data=mb_train_init)

summary(model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
##     data = mb_train_init)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.5627  -6.6932  -0.1328   6.5249  27.8525 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      57.912438   6.642839   8.718  < 2e-16 ***
## TEAM_BATTING_1B   0.015434   0.019626   0.786   0.4318    
## TEAM_BATTING_2B  -0.055039   0.020556  -2.677   0.0075 ** 
## TEAM_BATTING_3B   0.176985   0.028193   6.278 4.52e-10 ***
## TEAM_BATTING_HR   0.089385   0.090409   0.989   0.3230    
## TEAM_BATTING_BB   0.043765   0.046454   0.942   0.3463    
## TEAM_BATTING_SO   0.018250   0.023463   0.778   0.4368    
## TEAM_BASERUN_SB   0.035880   0.008687   4.130 3.83e-05 ***
## TEAM_BASERUN_CS   0.052124   0.018227   2.860   0.0043 ** 
## TEAM_PITCHING_H   0.019044   0.018381   1.036   0.3003    
## TEAM_PITCHING_HR  0.022997   0.082092   0.280   0.7794    
## TEAM_PITCHING_BB -0.004180   0.044692  -0.094   0.9255    
## TEAM_PITCHING_SO -0.038176   0.022447  -1.701   0.0892 .  
## TEAM_FIELDING_E  -0.155876   0.009946 -15.672  < 2e-16 ***
## TEAM_FIELDING_DP -0.112885   0.013137  -8.593  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.556 on 1471 degrees of freedom
##   (790 observations deleted due to missingness)
## Multiple R-squared:  0.4386, Adjusted R-squared:  0.4333 
## F-statistic:  82.1 on 14 and 1471 DF,  p-value: < 2.2e-16
# Error rate: residual standard error as a fraction of the mean win total.
# NOTE(review): the mean is taken over every row of mb_train_init, while the
# fit itself dropped 790 rows with missing values - confirm this is intended.
sigma(model) / mean(mb_train_init[["TARGET_WINS"]])
## [1] 0.1182788
# Reduced model: only include the variables treated as significant
# (doubles, triples, stolen bases, caught stealing, errors, double plays).
model_sig <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_init
)

# Coefficient table, residual summary and overall fit statistics
summary(model_sig)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
##     data = mb_train_init)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.687  -7.955  -0.154   8.008  37.873 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      92.152078   3.534132  26.075  < 2e-16 ***
## TEAM_BATTING_2B   0.024128   0.007505   3.215  0.00133 ** 
## TEAM_BATTING_3B   0.269509   0.021720  12.408  < 2e-16 ***
## TEAM_BASERUN_SB   0.019670   0.010031   1.961  0.05008 .  
## TEAM_BASERUN_CS   0.002149   0.021423   0.100  0.92012    
## TEAM_FIELDING_E  -0.162860   0.011250 -14.477  < 2e-16 ***
## TEAM_FIELDING_DP -0.048528   0.015288  -3.174  0.00153 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.47 on 1479 degrees of freedom
##   (790 observations deleted due to missingness)
## Multiple R-squared:  0.1875, Adjusted R-squared:  0.1842 
## F-statistic: 56.89 on 6 and 1479 DF,  p-value: < 2.2e-16
# Error rate of the reduced model: RSE as a fraction of the mean win total
sigma(model_sig) / mean(mb_train_init[["TARGET_WINS"]])
## [1] 0.1419097
# Variant of the baseline model in which the four hit-type counts are
# replaced by the single TOTAL_BASES column.
model_tb <- lm(
  formula = TARGET_WINS ~
    TOTAL_BASES +
    TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_init
)

# Coefficient table, residual summary and overall fit statistics
summary(model_tb)
## 
## Call:
## lm(formula = TARGET_WINS ~ TOTAL_BASES + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
##     data = mb_train_init)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.988  -6.708   0.014   6.523  29.819 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      70.338873   6.504527  10.814  < 2e-16 ***
## TOTAL_BASES      -0.004803   0.006700  -0.717 0.473567    
## TEAM_BATTING_BB   0.068653   0.035606   1.928 0.054032 .  
## TEAM_BATTING_SO   0.019373   0.020869   0.928 0.353390    
## TEAM_BASERUN_SB   0.032802   0.009004   3.643 0.000279 ***
## TEAM_BASERUN_CS   0.070570   0.018849   3.744 0.000188 ***
## TEAM_PITCHING_H   0.027246   0.009711   2.806 0.005086 ** 
## TEAM_PITCHING_HR  0.100054   0.019899   5.028 5.56e-07 ***
## TEAM_PITCHING_BB -0.031004   0.034030  -0.911 0.362401    
## TEAM_PITCHING_SO -0.047653   0.019806  -2.406 0.016252 *  
## TEAM_FIELDING_E  -0.125710   0.009863 -12.745  < 2e-16 ***
## TEAM_FIELDING_DP -0.109698   0.013625  -8.051 1.68e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.927 on 1474 degrees of freedom
##   (790 observations deleted due to missingness)
## Multiple R-squared:  0.393,  Adjusted R-squared:  0.3884 
## F-statistic: 86.74 on 11 and 1474 DF,  p-value: < 2.2e-16
# TOTAL_BASES did not prove helpful, but the model is kept for now.
# Error rate: RSE as a fraction of the mean win total.
sigma(model_tb) / mean(mb_train_init[["TARGET_WINS"]])
## [1] 0.1228704

The lower the RSE, the more accurate the model

Overall, the F-statistic p-value is < 2.2e-16, which is highly significant.

After adding all of the potential predictor variables, a variable is considered significant when its t-value is significantly different from zero. The coefficient rows noted from that fit are listed below:

    Variable           Estimate  Std. Error  t value  Pr(>|t|)
    TEAM_FIELDING_E    -0.17204     0.04140   -4.155  5.08e-05 ***
    TEAM_FIELDING_DP   -0.10819     0.03654   -2.961  0.00349  **
    TEAM_BATTING_3B    -0.10118     0.07751   -1.305  0.19348
    TEAM_BATTING_BB    -4.45969     3.63624   -1.226  0.22167
    TEAM_BASERUN_SB     0.03304     0.02867    1.152  0.25071
    TEAM_BATTING_HBP    0.08247     0.04960    1.663  0.09815  .
    TEAM_PITCHING_BB    4.51089     3.63372    1.241  0.21612

So let’s start removing questionable columns and observations from the data.

# Build the cleaned training set in a single subset() call:
#   * select: drop the TEAM_BATTING_HBP column
#   * TARGET_WINS <= 116     removes entries with too many wins
#   * TARGET_WINS != 0       removes entries with zero wins (1 total)
#   * TEAM_PITCHING_H < 2000 removes entries with too many hits allowed
# subset() also discards rows where the combined condition evaluates to NA.
mb_train_init_clean <- subset(
  mb_train_init,
  subset = TARGET_WINS <= 116 & TARGET_WINS != 0 & TEAM_PITCHING_H < 2000,
  select = -TEAM_BATTING_HBP
)

# Per-column summary of the cleaned data
summary(mb_train_init_clean)
##      INDEX         TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   2.0   Min.   : 21.00   Min.   :1137   Min.   :118.0  
##  1st Qu.: 648.5   1st Qu.: 71.00   1st Qu.:1377   1st Qu.:208.0  
##  Median :1279.0   Median : 82.00   Median :1446   Median :239.0  
##  Mean   :1272.6   Mean   : 80.55   Mean   :1451   Mean   :241.1  
##  3rd Qu.:1902.5   3rd Qu.: 91.00   3rd Qu.:1523   3rd Qu.:272.0  
##  Max.   :2534.0   Max.   :116.00   Max.   :1876   Max.   :392.0  
##                                                                  
##  TEAM_BATTING_3B  TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   : 11.00   Min.   :  3.0   Min.   :171.0   Min.   : 268.0  
##  1st Qu.: 33.00   1st Qu.: 54.0   1st Qu.:461.5   1st Qu.: 579.0  
##  Median : 45.00   Median :109.0   Median :519.0   Median : 791.0  
##  Mean   : 51.88   Mean   :106.0   Mean   :521.9   Mean   : 767.7  
##  3rd Qu.: 66.00   3rd Qu.:149.5   3rd Qu.:583.0   3rd Qu.: 943.0  
##  Max.   :147.00   Max.   :264.0   Max.   :878.0   Max.   :1399.0  
##                                                   NA's   :100     
##  TEAM_BASERUN_SB TEAM_BASERUN_CS  TEAM_PITCHING_H TEAM_PITCHING_HR
##  Min.   : 18.0   Min.   : 11.00   Min.   :1137    Min.   :  3.0   
##  1st Qu.: 65.0   1st Qu.: 38.00   1st Qu.:1408    1st Qu.: 58.0   
##  Median : 99.0   Median : 49.00   Median :1494    Median :112.0   
##  Mean   :118.2   Mean   : 52.81   Mean   :1521    Mean   :108.9   
##  3rd Qu.:149.0   3rd Qu.: 62.00   3rd Qu.:1608    3rd Qu.:152.0   
##  Max.   :654.0   Max.   :201.00   Max.   :1999    Max.   :264.0   
##  NA's   :8       NA's   :577                                      
##  TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
##  Min.   :247.0    Min.   : 301.0   Min.   : 65.0   Min.   : 68.0   
##  1st Qu.:482.0    1st Qu.: 626.0   1st Qu.:125.0   1st Qu.:133.0   
##  Median :536.0    Median : 811.0   Median :153.0   Median :149.0   
##  Mean   :545.3    Mean   : 795.3   Mean   :191.9   Mean   :147.6   
##  3rd Qu.:601.5    3rd Qu.: 953.5   3rd Qu.:212.0   3rd Qu.:164.0   
##  Max.   :929.0    Max.   :1659.0   Max.   :796.0   Max.   :228.0   
##                   NA's   :100                      NA's   :130     
##  TEAM_BATTING_1B  TOTAL_BASES  
##  Min.   : 811    Min.   :1453  
##  1st Qu.: 985    1st Qu.:1950  
##  Median :1042    Median :2123  
##  Mean   :1052    Mean   :2114  
##  3rd Qu.:1109    3rd Qu.:2276  
##  Max.   :1458    Max.   :2832  
## 
# Rows and columns remaining after the cleaning steps
dim(mb_train_init_clean)
## [1] 2015   18

Split the initial training data into train and validation

# https://machinelearningmastery.com/machine-learning-in-r-step-by-step/

# Fix the RNG seed so the random train/validation split - and every model
# summary that depends on it - is reproducible from run to run.
set.seed(500)

# createDataPartition() returns the row positions of the 80% TRAINING sample,
# drawn with respect to the distribution of TARGET_WINS. The object keeps its
# existing name `validation_index`, but it indexes the training rows.
validation_index <- createDataPartition(
  mb_train_init_clean$TARGET_WINS,
  p = 0.80,
  list = FALSE
)
# the 20% of rows NOT selected above are held out for validation
mb_valid_clean <- mb_train_init_clean[-validation_index, ]
# the selected 80% of rows are used for training and testing the models
mb_train_clean <- mb_train_init_clean[validation_index, ]

Attempt linear model on cleaned data

# Full model refit on the cleaned training split (same predictors as the
# baseline model); rows with missing predictor values are omitted.
model_clean <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
    TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)

# Coefficient table, residual summary and overall fit statistics
summary(model_clean)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
##     data = mb_train_clean, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.8782  -6.5006  -0.0018   6.4110  28.9285 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      58.201065   7.641781   7.616 5.54e-14 ***
## TEAM_BATTING_1B   0.048577   0.048556   1.000 0.317311    
## TEAM_BATTING_2B  -0.008746   0.049586  -0.176 0.860020    
## TEAM_BATTING_3B   0.203229   0.056923   3.570 0.000372 ***
## TEAM_BATTING_HR  -0.217925   0.241232  -0.903 0.366517    
## TEAM_BATTING_BB   0.100544   0.111822   0.899 0.368771    
## TEAM_BATTING_SO  -0.015440   0.053202  -0.290 0.771701    
## TEAM_BASERUN_SB   0.027806   0.010283   2.704 0.006952 ** 
## TEAM_BASERUN_CS   0.066262   0.021038   3.150 0.001678 ** 
## TEAM_PITCHING_H  -0.016064   0.046810  -0.343 0.731525    
## TEAM_PITCHING_HR  0.352839   0.221205   1.595 0.110977    
## TEAM_PITCHING_BB -0.059293   0.108798  -0.545 0.585878    
## TEAM_PITCHING_SO -0.006510   0.051740  -0.126 0.899893    
## TEAM_FIELDING_E  -0.146791   0.011822 -12.416  < 2e-16 ***
## TEAM_FIELDING_DP -0.105609   0.015068  -7.009 4.14e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.594 on 1120 degrees of freedom
##   (479 observations deleted due to missingness)
## Multiple R-squared:  0.4191, Adjusted R-squared:  0.4118 
## F-statistic: 57.71 on 14 and 1120 DF,  p-value: < 2.2e-16
# Residual standard error of the cleaned-data model; the outer parentheses
# print the value as it is assigned.
(sig <- sigma(model_clean))
## [1] 9.593717
# Error rate: RSE as a fraction of the mean win total.
# NOTE(review): the mean covers every row of mb_train_clean, while the fit
# dropped 479 rows with missing values - confirm this is intended.
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.1190831

Stepwise improvement of cleaned dataset

# STEP 1
# Manual backward elimination: full cleaned-data model minus TEAM_BATTING_SO.
model_clean_st1 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
    TEAM_BATTING_BB +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)

# Coefficient table, residual summary and overall fit statistics
summary(model_clean_st1)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train_clean, 
##     na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.8526  -6.5244   0.0445   6.4400  28.9837 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      58.306977   7.629944   7.642 4.58e-14 ***
## TEAM_BATTING_1B   0.040793   0.040458   1.008  0.31354    
## TEAM_BATTING_2B  -0.016786   0.041109  -0.408  0.68310    
## TEAM_BATTING_3B   0.194184   0.047613   4.078 4.86e-05 ***
## TEAM_BATTING_HR  -0.261124   0.189758  -1.376  0.16907    
## TEAM_BATTING_BB   0.103590   0.111283   0.931  0.35212    
## TEAM_BASERUN_SB   0.027902   0.010273   2.716  0.00671 ** 
## TEAM_BASERUN_CS   0.066802   0.020947   3.189  0.00147 ** 
## TEAM_PITCHING_H  -0.008487   0.038836  -0.219  0.82705    
## TEAM_PITCHING_HR  0.387751   0.185562   2.090  0.03688 *  
## TEAM_PITCHING_BB -0.062413   0.108221  -0.577  0.56425    
## TEAM_PITCHING_SO -0.021504   0.002810  -7.652 4.24e-14 ***
## TEAM_FIELDING_E  -0.147181   0.011741 -12.536  < 2e-16 ***
## TEAM_FIELDING_DP -0.105623   0.015062  -7.013 4.04e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.59 on 1121 degrees of freedom
##   (479 observations deleted due to missingness)
## Multiple R-squared:  0.419,  Adjusted R-squared:  0.4123 
## F-statistic: 62.19 on 13 and 1121 DF,  p-value: < 2.2e-16
# Residual standard error after step 1 (printed as it is assigned)
(sig <- sigma(model_clean_st1))
## [1] 9.589798
# Error rate: RSE as a fraction of the mean win total in the training split
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.1190345
# STEP 2
# Step 1 model with TEAM_PITCHING_H additionally removed.
model_clean_st2 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
    TEAM_BATTING_BB +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)

# Coefficient table, residual summary and overall fit statistics
summary(model_clean_st2)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train_clean, 
##     na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.8110  -6.5213   0.0232   6.4470  29.0552 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      58.445959   7.600165   7.690 3.20e-14 ***
## TEAM_BATTING_1B   0.032029   0.005341   5.997 2.70e-09 ***
## TEAM_BATTING_2B  -0.025587   0.008253  -3.100  0.00198 ** 
## TEAM_BATTING_3B   0.185187   0.023914   7.744 2.14e-14 ***
## TEAM_BATTING_HR  -0.264818   0.188923  -1.402  0.16128    
## TEAM_BATTING_BB   0.125298   0.050147   2.499  0.01261 *  
## TEAM_BASERUN_SB   0.028118   0.010221   2.751  0.00604 ** 
## TEAM_BASERUN_CS   0.066391   0.020853   3.184  0.00149 ** 
## TEAM_PITCHING_HR  0.382881   0.184140   2.079  0.03782 *  
## TEAM_PITCHING_BB -0.083558   0.048453  -1.725  0.08489 .  
## TEAM_PITCHING_SO -0.021550   0.002801  -7.693 3.14e-14 ***
## TEAM_FIELDING_E  -0.147510   0.011639 -12.674  < 2e-16 ***
## TEAM_FIELDING_DP -0.105432   0.015030  -7.015 3.98e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.586 on 1122 degrees of freedom
##   (479 observations deleted due to missingness)
## Multiple R-squared:  0.419,  Adjusted R-squared:  0.4128 
## F-statistic: 67.43 on 12 and 1122 DF,  p-value: < 2.2e-16
# Residual standard error after step 2 (printed as it is assigned)
(sig <- sigma(model_clean_st2))
## [1] 9.585728
# Error rate: RSE as a fraction of the mean win total in the training split
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.118984
# Information criteria for the step 2 model
AIC(model_clean_st2)
BIC(model_clean_st2)
# STEP 3
# Step 2 model with TEAM_BATTING_HR additionally removed.
model_clean_st3 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_BB +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)

# Coefficient table, residual summary and overall fit statistics
summary(model_clean_st3)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train_clean, 
##     na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.7825  -6.6518  -0.1376   6.3787  28.0915 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      58.295477   7.602670   7.668 3.78e-14 ***
## TEAM_BATTING_1B   0.032529   0.005331   6.102 1.44e-09 ***
## TEAM_BATTING_2B  -0.028519   0.007987  -3.571 0.000371 ***
## TEAM_BATTING_3B   0.185136   0.023924   7.738 2.23e-14 ***
## TEAM_BATTING_BB   0.058641   0.015925   3.682 0.000242 ***
## TEAM_BASERUN_SB   0.029877   0.010148   2.944 0.003306 ** 
## TEAM_BASERUN_CS   0.063748   0.020777   3.068 0.002205 ** 
## TEAM_PITCHING_HR  0.125084   0.009129  13.703  < 2e-16 ***
## TEAM_PITCHING_BB -0.018832   0.014685  -1.282 0.199963    
## TEAM_PITCHING_SO -0.021018   0.002777  -7.570 7.78e-14 ***
## TEAM_FIELDING_E  -0.149394   0.011566 -12.917  < 2e-16 ***
## TEAM_FIELDING_DP -0.104712   0.015028  -6.968 5.47e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.59 on 1123 degrees of freedom
##   (479 observations deleted due to missingness)
## Multiple R-squared:  0.418,  Adjusted R-squared:  0.4123 
## F-statistic: 73.32 on 11 and 1123 DF,  p-value: < 2.2e-16
# Residual standard error after step 3 (printed as it is assigned)
(sig <- sigma(model_clean_st3))
## [1] 9.589845
# Error rate: RSE as a fraction of the mean win total in the training split
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.1190351

Consistently, the best result is Step 3.

# STEP 4
# Step 3 model with TEAM_PITCHING_BB additionally removed.
model_clean_st4 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_BB +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_PITCHING_HR + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)

# Coefficient table, residual summary and overall fit statistics
summary(model_clean_st4)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
##     data = mb_train_clean, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.8522  -6.5628  -0.1001   6.4270  27.9040 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      59.935980   7.496424   7.995 3.19e-15 ***
## TEAM_BATTING_1B   0.032178   0.005325   6.042 2.06e-09 ***
## TEAM_BATTING_2B  -0.028138   0.007983  -3.525 0.000441 ***
## TEAM_BATTING_3B   0.178552   0.023374   7.639 4.67e-14 ***
## TEAM_BATTING_BB   0.038822   0.003845  10.097  < 2e-16 ***
## TEAM_BASERUN_SB   0.031367   0.010084   3.110 0.001915 ** 
## TEAM_BASERUN_CS   0.062814   0.020770   3.024 0.002549 ** 
## TEAM_PITCHING_HR  0.124755   0.009128  13.668  < 2e-16 ***
## TEAM_PITCHING_SO -0.021315   0.002768  -7.701 2.95e-14 ***
## TEAM_FIELDING_E  -0.151741   0.011424 -13.283  < 2e-16 ***
## TEAM_FIELDING_DP -0.106320   0.014980  -7.098 2.24e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.593 on 1124 degrees of freedom
##   (479 observations deleted due to missingness)
## Multiple R-squared:  0.4171, Adjusted R-squared:  0.4119 
## F-statistic: 80.44 on 10 and 1124 DF,  p-value: < 2.2e-16
# Residual standard error after step 4 (printed as it is assigned)
(sig <- sigma(model_clean_st4))
## [1] 9.592594
# Error rate: RSE as a fraction of the mean win total in the training split
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.1190692
# Standard lm diagnostic plots for the step 4 model
plot(model_clean_st4)

# Graph residuals
# Histogram of the step 3 model's residuals, one win per bin. The residuals
# are placed in their own data frame and mapped by column name, rather than
# handing the lm object to ggplot() and reaching into it with `$` inside aes().
# NOTE(review): the diagnostic plots immediately before this use
# model_clean_st4 while this histogram uses model_clean_st3 - confirm which
# model is intended.
ggplot(
  data = data.frame(residual = residuals(model_clean_st3)),
  mapping = aes(x = residual)
) +
  geom_histogram(binwidth = 1, color = "black", fill = "purple4") +
  theme(
    panel.background = element_rect(fill = "white"),
    axis.line.x = element_line(),
    axis.line.y = element_line()
  ) +
  ggtitle("Histogram for Model Residuals")

# Run the step wise approach using the stepAIC function
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Automated stepwise selection by AIC, starting from the full cleaned-data
# model and allowing terms to be both dropped and re-added; per-step tracing
# is switched off.
stepmodel <- stepAIC(
  object = model_clean,
  direction = "both",
  trace = FALSE
)

# Coefficient table, residual summary and fit statistics of the selected model
summary(stepmodel)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_1B + TEAM_BATTING_3B + 
##     TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS + 
##     TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
##     data = mb_train_clean, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.8918  -6.6459  -0.1023   6.5008  28.1725 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      58.313645   7.578559   7.695 3.09e-14 ***
## TEAM_BATTING_1B   0.056888   0.007474   7.612 5.72e-14 ***
## TEAM_BATTING_3B   0.211683   0.025403   8.333 2.28e-16 ***
## TEAM_BATTING_BB   0.039484   0.003847  10.263  < 2e-16 ***
## TEAM_BATTING_SO  -0.022025   0.002811  -7.836 1.07e-14 ***
## TEAM_BASERUN_SB   0.028665   0.010062   2.849  0.00447 ** 
## TEAM_BASERUN_CS   0.064731   0.020712   3.125  0.00182 ** 
## TEAM_PITCHING_H  -0.024379   0.004864  -5.012 6.25e-07 ***
## TEAM_PITCHING_HR  0.148530   0.011372  13.061  < 2e-16 ***
## TEAM_FIELDING_E  -0.147211   0.011390 -12.924  < 2e-16 ***
## TEAM_FIELDING_DP -0.104964   0.014991  -7.002 4.35e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.582 on 1124 degrees of freedom
##   (479 observations deleted due to missingness)
## Multiple R-squared:  0.4184, Adjusted R-squared:  0.4132 
## F-statistic: 80.87 on 10 and 1124 DF,  p-value: < 2.2e-16
# Residual standard error of the stepAIC-selected model (printed on assignment)
(sig <- sigma(stepmodel))
## [1] 9.582
# Error rate: RSE as a fraction of the mean win total in the training split
sig / mean(mb_train_clean[["TARGET_WINS"]])
## [1] 0.1189377
# Standard lm diagnostic plots for the selected model
plot(stepmodel)

Impute missing data

https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/

Using package MICE

library(mice)
## Warning: package 'mice' was built under R version 3.6.2
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# SKIP THIS SECTION

# Impute the missing data: 5 imputed datasets, up to 50 iterations each,
# predictive mean matching, fixed seed.
mb_train_init_imputed <- mice(
  mb_train_init,
  m = 5,
  maxit = 50,
  method = "pmm",
  seed = 500
)

summary(mb_train_init_imputed)

# Second completed (imputed) dataset. complete() is namespaced because tidyr,
# attached through tidyverse, exports a function of the same name.
mb_train_imp_2 <- mice::complete(mb_train_init_imputed, 2)

# createDataPartition() returns the row positions of the 80% training sample
# (the object keeps the name validation_index, but it indexes training rows)
validation_index <- createDataPartition(
  mb_train_imp_2$TARGET_WINS,
  p = 0.80,
  list = FALSE
)
# the 20% of rows not selected are held out for validation
mb_valid_imp <- mb_train_imp_2[-validation_index, ]
# the selected 80% of rows are used for training and testing the models
mb_train_imp <- mb_train_imp_2[validation_index, ]
# output one of the imputed dataframes (same completed dataset as above)
complete_data_2 <- mb_train_imp_2

head(complete_data_2)
head(mb_train_init)
# build predictive model
# The lm() call must NOT carry its own `data =` argument: with() evaluates the
# expression once inside each of the 5 imputed datasets, and an explicit
# `data = mb_train_clean` made every fit use the same non-imputed data frame,
# so there was nothing to pool. `expr` is also spelled out in full instead of
# relying on partial matching of `exp`.
# NOTE(review): the model is fitted on all imputed rows, which include the
# rows later held out in mb_valid_imp - confirm whether that is acceptable.
fit <- with(
  data = mb_train_init_imputed,
  expr = lm(
    TARGET_WINS ~
      TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
      TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB +
      TEAM_BASERUN_CS + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
      TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
  )
)

summary(fit)
# combine results of all 5 models
pooled <- pool(fit)
summary(pooled)
# https://stackoverflow.com/questions/52713733/how-to-use-predict-function-with-my-pooled-results-from-mice


# Copy one of the fitted lm models fit to one of the imputed datasets; it is
# used only as a container that predict() understands.
pooled_lm <- fit$analyses[[1]]

# Replace the fitted coefficients with the pooled estimates. The estimates are
# matched to the lm coefficients by position, so verify the count and - when
# the pooled summary exposes a `term` column - the order before splicing them
# in, and keep the coefficient names.
pooled_summary <- summary(pooled)
pooled_estimates <- pooled_summary[["estimate"]]
stopifnot(length(pooled_estimates) == length(coef(pooled_lm)))
pooled_terms <- pooled_summary[["term"]]
if (!is.null(pooled_terms)) {
  stopifnot(identical(as.character(pooled_terms), names(coef(pooled_lm))))
}
names(pooled_estimates) <- names(coef(pooled_lm))
pooled_lm$coefficients <- pooled_estimates

# Predict - predictions seem to match the
#   pooled coefficients rather than the original
#   lm that was copied
# predict(fit$analyses[[1]], newdata = nhanes)
wins_pred_imp <- predict(pooled_lm, newdata = mb_valid_imp)

wins_pred_imp

# Actual vs. predicted wins, side by side
actual_preds_imp <- data.frame(
  actuals = mb_valid_imp$TARGET_WINS,
  predicteds = wins_pred_imp
)

# actual_preds_imp <- subset(actual_preds_imp, predicteds > 0)

actual_preds_imp
correlation_accuracy <- cor(actual_preds_imp)

correlation_accuracy
# Mean absolute percentage error, computed with base R so this step does not
# depend on an extra package. A row with zero actual wins would make the
# result infinite.
mape <- with(actual_preds_imp, mean(abs((actuals - predicteds) / actuals)))

# lower is better
# mape 0.1397487 after removal of negative wins
# 0.3172925 with the negative scores
mape

Run against evaluation data

# Read in the CSV file of evaluation data (path is relative to the working
# directory)
mb_eval <- read.csv("moneyball-evaluation-data.csv")
# Dimensions of the evaluation dataset: rows, then columns
dim(mb_eval)
## [1] 259  16
# List the class of each column of the evaluation data
sapply(mb_eval, class)
##            INDEX   TEAM_BATTING_H  TEAM_BATTING_2B  TEAM_BATTING_3B 
##        "integer"        "integer"        "integer"        "integer" 
##  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB 
##        "integer"        "integer"        "integer"        "integer" 
##  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H TEAM_PITCHING_HR 
##        "integer"        "integer"        "integer"        "integer" 
## TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##        "integer"        "integer"        "integer"        "integer"
# First rows of the evaluation data (head() shows six rows by default)
head(mb_eval)
##   INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1     9           1209             170              33              83
## 2    10           1221             151              29              88
## 3    14           1395             183              29              93
## 4    47           1539             309              29             159
## 5    60           1445             203              68               5
## 6    63           1431             236              53              10
##   TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1             447            1080              62              50
## 2             516             929              54              39
## 3             509             816              59              47
## 4             486             914             148              57
## 5              95             416              NA              NA
## 6             215             377              NA              NA
##   TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 1               NA            1209               83              447
## 2               NA            1221               88              516
## 3               NA            1395               93              509
## 4               42            1539              159              486
## 5               NA            3902               14              257
## 6               NA            2793               20              420
##   TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1             1080             140              156
## 2              929             135              164
## 3              816             156              153
## 4              914             124              154
## 5             1123             616              130
## 6              736             572              105
# Per-column summary statistics (including NA counts) of the evaluation data
summary(mb_eval)
##      INDEX      TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B 
##  Min.   :   9   Min.   : 819   Min.   : 44.0   Min.   : 14.00  
##  1st Qu.: 708   1st Qu.:1387   1st Qu.:210.0   1st Qu.: 35.00  
##  Median :1249   Median :1455   Median :239.0   Median : 52.00  
##  Mean   :1264   Mean   :1469   Mean   :241.3   Mean   : 55.91  
##  3rd Qu.:1832   3rd Qu.:1548   3rd Qu.:278.5   3rd Qu.: 72.00  
##  Max.   :2525   Max.   :2170   Max.   :376.0   Max.   :155.00  
##                                                                
##  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB
##  Min.   :  0.00   Min.   : 15.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 44.50   1st Qu.:436.5   1st Qu.: 545.0   1st Qu.: 59.0  
##  Median :101.00   Median :509.0   Median : 686.0   Median : 92.0  
##  Mean   : 95.63   Mean   :499.0   Mean   : 709.3   Mean   :123.7  
##  3rd Qu.:135.50   3rd Qu.:565.5   3rd Qu.: 912.0   3rd Qu.:151.8  
##  Max.   :242.00   Max.   :792.0   Max.   :1268.0   Max.   :580.0  
##                                   NA's   :18       NA's   :13     
##  TEAM_BASERUN_CS  TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
##  Min.   :  0.00   Min.   :42.00    Min.   : 1155   Min.   :  0.0   
##  1st Qu.: 38.00   1st Qu.:53.50    1st Qu.: 1426   1st Qu.: 52.0   
##  Median : 49.50   Median :62.00    Median : 1515   Median :104.0   
##  Mean   : 52.32   Mean   :62.37    Mean   : 1813   Mean   :102.1   
##  3rd Qu.: 63.00   3rd Qu.:67.50    3rd Qu.: 1681   3rd Qu.:142.5   
##  Max.   :154.00   Max.   :96.00    Max.   :22768   Max.   :336.0   
##  NA's   :87       NA's   :240                                      
##  TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E  TEAM_FIELDING_DP
##  Min.   : 136.0   Min.   :   0.0   Min.   :  73.0   Min.   : 69.0   
##  1st Qu.: 471.0   1st Qu.: 613.0   1st Qu.: 131.0   1st Qu.:131.0   
##  Median : 526.0   Median : 745.0   Median : 163.0   Median :148.0   
##  Mean   : 552.4   Mean   : 799.7   Mean   : 249.7   Mean   :146.1   
##  3rd Qu.: 606.5   3rd Qu.: 938.0   3rd Qu.: 252.0   3rd Qu.:164.0   
##  Max.   :2008.0   Max.   :9963.0   Max.   :1568.0   Max.   :204.0   
##                   NA's   :18                        NA's   :31
# Impute the missing data in the held-out validation split with MICE:
# 5 imputed datasets, up to 50 iterations each, predictive mean matching,
# fixed seed.
mb_valid_clean_imputed <- mice(
  mb_valid_clean,
  m = 5,
  maxit = 50,
  method = "pmm",
  seed = 500
)
## 
##  iter imp variable
##   1   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   1   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   1   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   1   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   2   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   2   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   2   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   3   1  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   3   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   3   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   4   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   4   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   4   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   4   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   4   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   5   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   5   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   5   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   5   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   5   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   6   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   6   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   6   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   6   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   6   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   7   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   7   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   7   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   7   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   7   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   8   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   8   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   8   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   8   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   8   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   9   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   9   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   9   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   9   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   9   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   10   1  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   10   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   10   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   10   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   10   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   11   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   11   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   11   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   11   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   11   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   12   1  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   12   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   12   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   12   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   12   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   13   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   13   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   13   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   13   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   13   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   14   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   14   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   14   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   14   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   14   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   15   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   15   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   15   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   15   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   15   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   16   1  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   16   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   16   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   16   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   16   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   17   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   17   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   17   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   17   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   17   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   18   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   18   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   18   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   18   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   18   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   19   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   19   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   19   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   19   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   19   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   20   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   20   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   20   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   20   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   20   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   21   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   21   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   21   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   21   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   21   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   22   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   22   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   22   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   22   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   22   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   23   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   23   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   23   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   23   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   23   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   24   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   24   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   24   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   24   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   24   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   25   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   25   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   25   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   25   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   25   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   26   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   26   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   26   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   26   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   26   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   27   1  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   27   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   27   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   27   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   27   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   28   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   28   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   28   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   28   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   28   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   29   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   29   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   29   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   29   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   29   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   30   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   30   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   30   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   30   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   30   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   31   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   31   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   31   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   31   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   31   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   32   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   32   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   32   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   32   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   32   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   33   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   33   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   33   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   33   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   33   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   34   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   34   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   34   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   34   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   34   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   35   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   35   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   35   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   35   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   35   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   36   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   36   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   36   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   36   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   36   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   37   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   37   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   37   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   37   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   37   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   38   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   38   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   38   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   38   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   38   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   39   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   39   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   39   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   39   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   39   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   40   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   40   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   40   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   40   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   40   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   41   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   41   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   41   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   41   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   41   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   42   1  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   42   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   42   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   42   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   42   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   43   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   43   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   43   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   43   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   43   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   44   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   44   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   44   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   44   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   44   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   45   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   45   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   45   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   45   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   45   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   46   1  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   46   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   46   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   46   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   46   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   47   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   47   2  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   47   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   47   4  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   47   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   48   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   48   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   48   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   48   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   48   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   49   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP*
##   49   2  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   49   3  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   49   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   49   5  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##   50   1  TEAM_BATTING_SO*  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   50   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   50   3  TEAM_BATTING_SO*  TEAM_BASERUN_SB  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   50   4  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP*
##   50   5  TEAM_BATTING_SO  TEAM_BASERUN_SB*  TEAM_BASERUN_CS*  TEAM_PITCHING_SO*  TEAM_FIELDING_DP
##  * Please inspect the loggedEvents
## Warning: Number of logged events: 2008
# Print an overview of the multiply-imputed object (class `mids`): the
# imputation method used per column, the predictor matrix and the logged events.
summary(mb_valid_clean_imputed)
## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##            INDEX      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B 
##               ""               ""               ""               "" 
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##               ""               ""               ""            "pmm" 
##  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_H TEAM_PITCHING_HR 
##            "pmm"            "pmm"               ""               "" 
## TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##               ""            "pmm"               ""            "pmm" 
##  TEAM_BATTING_1B      TOTAL_BASES 
##               ""               "" 
## PredictorMatrix:
##                 INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## INDEX               0           1              1               1
## TARGET_WINS         1           0              1               1
## TEAM_BATTING_H      1           1              0               1
## TEAM_BATTING_2B     1           1              1               0
## TEAM_BATTING_3B     1           1              1               1
## TEAM_BATTING_HR     1           1              1               1
##                 TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## INDEX                         1               1               1
## TARGET_WINS                   1               1               1
## TEAM_BATTING_H                1               1               1
## TEAM_BATTING_2B               1               1               1
## TEAM_BATTING_3B               0               1               1
## TEAM_BATTING_HR               1               0               1
##                 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## INDEX                         1               1               1
## TARGET_WINS                   1               1               1
## TEAM_BATTING_H                1               1               1
## TEAM_BATTING_2B               1               1               1
## TEAM_BATTING_3B               1               1               1
## TEAM_BATTING_HR               1               1               1
##                 TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## INDEX                         1                1                1
## TARGET_WINS                   1                1                1
## TEAM_BATTING_H                1                1                1
## TEAM_BATTING_2B               1                1                1
## TEAM_BATTING_3B               1                1                1
## TEAM_BATTING_HR               1                1                1
##                 TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## INDEX                          1               1                1
## TARGET_WINS                    1               1                1
## TEAM_BATTING_H                 1               1                1
## TEAM_BATTING_2B                1               1                1
## TEAM_BATTING_3B                1               1                1
## TEAM_BATTING_HR                1               1                1
##                 TEAM_BATTING_1B TOTAL_BASES
## INDEX                         1           1
## TARGET_WINS                   1           1
## TEAM_BATTING_H                1           1
## TEAM_BATTING_2B               1           1
## TEAM_BATTING_3B               1           1
## TEAM_BATTING_HR               1           1
## Number of logged events:  2008 
##   it im             dep meth
## 1  1  1 TEAM_BATTING_SO  pmm
## 2  1  1 TEAM_BATTING_SO  pmm
## 3  1  1 TEAM_BASERUN_SB  pmm
## 4  1  1 TEAM_BASERUN_SB  pmm
## 5  1  1 TEAM_BASERUN_CS  pmm
## 6  1  1 TEAM_BASERUN_CS  pmm
##                                                                                                                                                                                                                                                                                                                       out
## 1                                                                                                                                                                                                                                                                                         TEAM_BATTING_H, TEAM_BATTING_BB
## 2 * A ridge penalty had to be used to calculate the inverse crossproduct of the predictor matrix. Please remove duplicate variables or unique respondent names/numbers from the imputation model. It may be advisable to check the fraction of missing information (fmi) to evaluate the validity of the imputation model
## 3                                                                                                                                                                                                                                                                                                          TEAM_BATTING_H
## 4 * A ridge penalty had to be used to calculate the inverse crossproduct of the predictor matrix. Please remove duplicate variables or unique respondent names/numbers from the imputation model. It may be advisable to check the fraction of missing information (fmi) to evaluate the validity of the imputation model
## 5                                                                                                                                                                                                                                                     TEAM_BATTING_H, TEAM_PITCHING_BB, TEAM_PITCHING_SO, TEAM_FIELDING_E
## 6 * A ridge penalty had to be used to calculate the inverse crossproduct of the predictor matrix. Please remove duplicate variables or unique respondent names/numbers from the imputation model. It may be advisable to check the fraction of missing information (fmi) to evaluate the validity of the imputation model
# Extract the second of the five completed (imputed) data sets.
# `mice::` is spelled out because tidyverse also attaches tidyr::complete(),
# so the bare name `complete()` depends on package load order; `action` is
# named so the choice of imputation number is explicit.
mb_valid_clean_imputed_2 <- mice::complete(mb_valid_clean_imputed, action = 2)

# create a list of 80% of the rows in the original dataset we can use for training
#validation_index <- createDataPartition(mb_train_imp_2$TARGET_WINS, p=0.80, list=FALSE)
# select 20% of the data for validation
#mb_valid_imp <- mb_train_imp_2[-validation_index,]
# use the remaining 80% of data to training and testing the models
#mb_train_imp <- mb_train_imp_2[validation_index,]
# output one of the imputed dataframes
# (mb_train_init_imputed is presumably a `mids` object as well -- confirm)
complete_data_2 <- mice::complete(mb_train_init_imputed, action = 2)

# Preview the first rows of the completed data
head(complete_data_2)
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
# Evaluate on mb_valid_clean instead of mb_eval: the goal is to see how good
# the predictions are where TARGET_WINS is known.

# Keep only the rows of mb_valid_clean that have no missing values.
# NOTE(review): the prediction below is made on mb_valid_clean_imputed_2, not
# on this filtered data frame -- confirm this filtering is still needed.
mb_valid_clean <- mb_valid_clean[which(complete.cases(mb_valid_clean)), ]

# Predicted wins from model_clean_st3 for the imputed validation data.
wins_pred <- predict(model_clean_st3, newdata = mb_valid_clean_imputed_2)

# wins_pred

# Isolate any predictions below zero and show them.
wins_pred_neg <- wins_pred[which(wins_pred < 0)]

wins_pred_neg
##       285 
## -23.70446
# Root-mean-square error of the predictions against the known win totals.
# The result is stored under the same name as the function being called.
rmse <- rmse(actual = mb_valid_clean_imputed_2$TARGET_WINS,
             predicted = wins_pred)

print(rmse)
## [1] 15.87231
# Mean absolute percentage error of the predictions.
# MLmetrics::MAPE() is declared as MAPE(y_pred, y_true) and divides the
# absolute error by y_true. Both arguments are passed by name so the error is
# scaled by the actual win totals, not by the predicted ones.
mape <- MAPE(y_pred = wins_pred,
             y_true = mb_valid_clean_imputed_2$TARGET_WINS)

mape
## [1] 0.2325064
  • RMSE: 15.87231

  • MAPE: 0.2541715

# Repeat the evaluation with stepmodel: predict wins for the imputed
# validation data (this overwrites the earlier wins_pred).
wins_pred <- predict(stepmodel, newdata = mb_valid_clean_imputed_2)

# wins_pred

# Root-mean-square error of these predictions against the known win totals.
rmse <- rmse(actual = mb_valid_clean_imputed_2$TARGET_WINS,
             predicted = wins_pred)

print(rmse)
## [1] 16.25347
# Mean absolute percentage error of the current predictions.
# MLmetrics::MAPE() is declared as MAPE(y_pred, y_true) and divides the
# absolute error by y_true. Both arguments are passed by name so the error is
# scaled by the actual win totals, not by the predicted ones.
mape <- MAPE(y_pred = wins_pred,
             y_true = mb_valid_clean_imputed_2$TARGET_WINS)

mape
## [1] 0.2836372
  • RMSE: 16.25347

  • MAPE: 0.2541715

# Correlation between actual and predicted wins, following
# http://r-statistics.co/Linear-Regression.html
# The two vectors are bound column-wise first and then turned into a data
# frame, so both columns go through the same matrix coercion.
actual_preds <- cbind(actuals = mb_valid_clean_imputed_2$TARGET_WINS,
                      predicteds = wins_pred) %>%
  data.frame()
correlation_accuracy <- cor(actual_preds)

print(correlation_accuracy)
##              actuals predicteds
## actuals    1.0000000  0.3987365
## predicteds 0.3987365  1.0000000
# actual_preds

Extra attempts at initial model

# STEP 2 TB: TOTAL_BASES stands in for the individual hit-type columns.
# TEAM_PITCHING_H is left out of this fit.
model_clean_st2tb <- lm(
  formula = TARGET_WINS ~
    TOTAL_BASES +
    TEAM_BATTING_BB +
    TEAM_BASERUN_SB +
    TEAM_BASERUN_CS +
    TEAM_PITCHING_HR +
    TEAM_PITCHING_BB +
    TEAM_PITCHING_SO +
    TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)

summary(model_clean_st2tb)

# Residual standard error expressed as a fraction of the mean of TARGET_WINS.
# NOTE(review): the mean is taken over every row of mb_train_clean, while the
# fit drops rows with missing values (na.omit) -- confirm this is intended.
sigma(model_clean_st2tb) / mean(mb_train_clean[["TARGET_WINS"]])
# STEP 3 TB: same as the STEP 2 TB model, with TEAM_PITCHING_BB removed.
model_clean_st3tb <- lm(
  formula = TARGET_WINS ~
    TOTAL_BASES +
    TEAM_BATTING_BB +
    TEAM_BASERUN_SB +
    TEAM_BASERUN_CS +
    TEAM_PITCHING_HR +
    TEAM_PITCHING_SO +
    TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)

summary(model_clean_st3tb)

# Author's observation: the fit got worse.
# Residual standard error expressed as a fraction of the mean of TARGET_WINS.
# NOTE(review): the mean is taken over every row of mb_train_clean, while the
# fit drops rows with missing values (na.omit) -- confirm this is intended.
sigma(model_clean_st3tb) / mean(mb_train_clean[["TARGET_WINS"]])
# STEP 3 (author's observation: got worse).
# Individual hit-type columns are used here; TEAM_BATTING_HR is removed.
model_clean_st3 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_H +
    TEAM_BATTING_2B +
    TEAM_BATTING_3B +
    TEAM_BATTING_BB +
    TEAM_BASERUN_SB +
    TEAM_BASERUN_CS +
    TEAM_PITCHING_HR +
    TEAM_PITCHING_BB +
    TEAM_PITCHING_SO +
    TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train_clean,
  na.action = na.omit
)

summary(model_clean_st3)

# Residual standard error expressed as a fraction of the mean of TARGET_WINS.
# NOTE(review): the mean is taken over every row of mb_train_clean, while the
# fit drops rows with missing values (na.omit) -- confirm this is intended.
sigma(model_clean_st3) / mean(mb_train_clean[["TARGET_WINS"]])

# NOTE(review): this shows the fitted values of model_clean, not of the
# model_clean_st3 fitted just above -- confirm which model was intended.
fitted(model_clean)
# Smaller model on mb_train: triples, walks, stolen bases, hit-by-pitch,
# walks allowed, errors and double plays as predictors of TARGET_WINS.
model_2 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_3B +
    TEAM_BATTING_BB +
    TEAM_BASERUN_SB +
    TEAM_BATTING_HBP +
    TEAM_PITCHING_BB +
    TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train
)

summary(model_2)

# Residual standard error expressed as a fraction of the mean of TARGET_WINS.
sigma(model_2) / mean(mb_train[["TARGET_WINS"]])

The fit got worse. As expected, at least some of the hitting statistics need to be included in the model.

Add up all the hits for total bases

http://www.philsbaseball.com/Articles/2010_to_2014/2014/September/total_base_percentage.php#:~:text=Here%20is%20the%20formula%3A%20Total,4)%20by%20at%2Dbats. The formula is: (total bases + walks + hit-by-pitches + stolen bases − caught stealing) ÷ plate appearances.

# Model on mb_train that uses TOTAL_BASES in place of the individual hit
# columns, alongside the walk, strikeout, baserunning, hit-by-pitch,
# pitching and fielding predictors.
model_TB <- lm(
  formula = TARGET_WINS ~
    TOTAL_BASES +
    TEAM_BATTING_BB +
    TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_BASERUN_CS +
    TEAM_BATTING_HBP +
    TEAM_PITCHING_H +
    TEAM_PITCHING_HR +
    TEAM_PITCHING_BB +
    TEAM_PITCHING_SO +
    TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train
)

summary(model_TB)

# Residual standard error expressed as a fraction of the mean of TARGET_WINS.
sigma(model_TB) / mean(mb_train[["TARGET_WINS"]])
# Derived column: walks added to total bases, stored on mb_train.
mb_train[["TOTAL_BASES_PLUS"]] <- with(mb_train, TEAM_BATTING_BB + TOTAL_BASES)

# Preview the first rows, including the new column.
head(mb_train)
# Variant of the total-bases model in which TOTAL_BASES_PLUS (total bases
# plus walks) replaces the separate TOTAL_BASES and TEAM_BATTING_BB terms.
model_TB_plus <- lm(
  formula = TARGET_WINS ~
    TOTAL_BASES_PLUS +
    TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_BASERUN_CS +
    TEAM_BATTING_HBP +
    TEAM_PITCHING_H +
    TEAM_PITCHING_HR +
    TEAM_PITCHING_BB +
    TEAM_PITCHING_SO +
    TEAM_FIELDING_E +
    TEAM_FIELDING_DP,
  data = mb_train
)

summary(model_TB_plus)

# Residual standard error expressed as a fraction of the mean of TARGET_WINS.
sigma(model_TB_plus) / mean(mb_train[["TARGET_WINS"]])